---
title: "Accuracy"
author: "Kohei Watanabe"
date: "`r format(Sys.time(), '%Y-%m-%d')`"
output: html_document
---

```{r}
# Chunk defaults for the rest of the report.
knitr::opts_chunk$set(echo = TRUE, dpi = 150, fig.height = 4, fig.width = 8)

# Classified documents; the `threat` flag is looked up from here by `docid`.
dat <- readRDS("../class.RDS")

# Human-coded domestic sample, with the classifier's threat flag attached.
dat_us <- read.csv("domestic_coded.csv", stringsAsFactors = FALSE)
dat_us$threat <- dat$threat[match(dat_us$docid, dat$docid)]
dat_us$decade <- 10 * floor(dat_us$year / 10)
# A missing flag (no match in `dat`, or NA there) is treated as "no threat".
dat_us$threat <- replace(dat_us$threat, is.na(dat_us$threat), FALSE)

# Human-coded foreign sample, prepared the same way.
dat_int <- read.csv("foreign_coded.csv", stringsAsFactors = FALSE)
dat_int$threat <- dat$threat[match(dat_int$docid, dat$docid)]
dat_int$decade <- 10 * floor(dat_int$year / 10)
dat_int$threat <- replace(dat_int$threat, is.na(dat_int$threat), FALSE)

# Names of the 20 most frequent country classes, most frequent first.
country_freq <- sort(table(dat_int$country.class))
top <- rev(names(tail(country_freq, 20)))
```

## Threat classification

### Individual level

Compare the classification of individual items by LSS with that by human coders.

```{r}
# Item-level agreement: TRUE when the human "Yes" code equals the `threat`
# flag. Items coded "Unclear" are always scored as incorrect.
dat_int$threat.correct <- with(dat_int, (threat.coded == "Yes") == threat)
dat_int$threat.correct[dat_int$threat.coded == "Unclear"] <- FALSE

# Accuracy over the whole foreign sample.
accu <- with(
  dat_int,
  newsmap::accuracy(as.factor(threat), as.factor(threat.coded == "Yes"))
)
accu
summary(accu)

# Same measures, restricted to the most frequent countries in `top`.
dat_int_top <- dat_int[dat_int$country.class %in% top, ]
accu_top <- with(
  dat_int_top,
  newsmap::accuracy(as.factor(threat), as.factor(threat.coded == "Yes"))
)
accu_top
summary(accu_top)
```

### Aggregated level

#### Country

```{r}
# Cross-tabulate country class against the threat flag (lss) and against the
# human "Yes" code (man), plus the plain per-country document count.
tb_threat_class_lss <- table(dat_int$country.class, dat_int$threat)
tb_threat_class_man <- table(dat_int$country.class, dat_int$threat.coded == "Yes")
tb_threat_class_all <- table(dat_int$country.class)

# The second column of each cross-tab is taken as the positive count;
# this assumes both values of the flag occur in the data.
dat_threat_class <- data.frame(
  man = tb_threat_class_man[, 2],
  lss = tb_threat_class_lss[, 2]
)
dat_threat_class$all <- as.numeric(tb_threat_class_all)

dat_threat_class[top, ]
```

```{r}
# Rescale every count column to shares that sum to one across countries.
dat_threat_class$all <- prop.table(dat_threat_class$all)
dat_threat_class$lss <- prop.table(dat_threat_class$lss)
dat_threat_class$man <- prop.table(dat_threat_class$man)

# Deviation of each share from the human-coded share.
dat_threat_class$diff_lss <- with(dat_threat_class, lss - man)
dat_threat_class$diff_all <- with(dat_threat_class, all - man)

dat_threat_class[top, ]
```

```{r}
# Correlation of the human-coded shares with the lss shares, then with the
# overall document shares.
with(dat_threat_class, cor(man, lss))
with(dat_threat_class, cor(man, all))
```

#### Year

```{r}
# Cross-tabulate decade against the threat flag (lss) and against the human
# "Yes" code (man), plus the plain per-decade document count.
tb_threat_year_lss <- table(dat_int$decade, dat_int$threat)
tb_threat_year_man <- table(dat_int$decade, dat_int$threat.coded == "Yes")
tb_threat_year_all <- table(dat_int$decade)

# The second column of each cross-tab is taken as the positive count;
# this assumes both values of the flag occur in the data.
dat_threat_year <- data.frame(
  man = tb_threat_year_man[, 2],
  lss = tb_threat_year_lss[, 2]
)
dat_threat_year$all <- as.numeric(tb_threat_year_all)

dat_threat_year
```

```{r}
# Rescale every count column to shares that sum to one across decades.
dat_threat_year$all <- prop.table(dat_threat_year$all)
dat_threat_year$lss <- prop.table(dat_threat_year$lss)
dat_threat_year$man <- prop.table(dat_threat_year$man)

# Deviation of each share from the human-coded share.
dat_threat_year$diff_lss <- with(dat_threat_year, lss - man)
dat_threat_year$diff_all <- with(dat_threat_year, all - man)

dat_threat_year
```

```{r fig.height=4, fig.width=8}
# Plot, per decade, how far the lss share and the overall share deviate from
# the human-coded share (columns 4 and 5: diff_lss, diff_all).
par(mar = c(2, 4, 2, 2))
matplot(dat_threat_year[,4:5], type = "b", xaxt = "n")
grid()
# matplot() cycles through col = 1:6 and lty = 1:5 and marks points with the
# characters "1", "2", ... by default, so the legend must repeat those
# settings; a fixed lty = 1 would show the dashed second series as solid.
legend("topright", legend = colnames(dat_threat_year[, 4:5]),
       col = 1:2, lty = 1:2, pch = c("1", "2"))
# Label the x positions with the decades stored in the row names.
axis(1, seq_len(nrow(dat_threat_year)), rownames(dat_threat_year))
abline(h = 0)
```


```{r}
# Correlation of the human-coded shares with the lss shares, then with the
# overall document shares.
with(dat_threat_year, cor(man, lss))
with(dat_threat_year, cor(man, all))
```

```{r fig.height=4.5, fig.width=8.5}
# Two side-by-side scatter plots of human-coded share (x) against lss share
# (y), each annotated with Pearson's r and a least-squares line.
par(mfrow = c(1, 2), mar = c(4, 4, 4, 2), font.main = 1)

# Left panel: one point per decade.
r <- round(with(dat_threat_year, cor(man, lss)), 2)
plot(dat_threat_year$man, dat_threat_year$lss,
     main = "Decade", xlab = "Human", ylab = "LSS")
legend("topleft", legend = paste0("r=", r), adj = c(0.5, 0), bty = "n")
abline(lm(lss ~ man, data = dat_threat_year))

# Right panel: one point per country class.
r <- round(with(dat_threat_class, cor(man, lss)), 2)
plot(dat_threat_class$man, dat_threat_class$lss,
     main = "Country", xlab = "Human", ylab = "LSS")
legend("topleft", legend = paste0("r=", r), adj = c(0.5, 0), bty = "n")
abline(lm(lss ~ man, data = dat_threat_class))
```

## Country classification

### US (domestic)

```{r}
# Distribution of the human codes in the domestic sample, as proportions.
tb_us <- table(dat_us$coded)
prop.table(tb_us)
```

```{r}
# Same distribution per decade; each row (decade) sums to one.
tb_us_year <- table(dat_us$decade, dat_us$coded)
prop.table(tb_us_year, margin = 1)
```

### Other countries

```{r}
# Recode the human verdict on the assigned country as a logical:
# "Yes" -> TRUE, "No" -> FALSE, everything else (e.g. "Unclear") -> NA.
# The column name's spelling (sic) is kept because later chunks read it.
dat_int$country.name.corect <- c(TRUE, FALSE)[
  match(dat_int$country.name.coded, c("Yes", "No"))
]

# Share of correct and incorrect assignments; NA rows are not counted.
tb_int <- table(dat_int$country.name.corect)
prop.table(tb_int)
```

```{r}
# Correct/incorrect shares per decade; each row (decade) sums to one.
tb_int_year <- table(dat_int$decade, dat_int$country.name.corect)
prop.table(tb_int_year, margin = 1)
```

```{r}
# Correct/incorrect counts per country class (this overwrites the earlier
# one-way `tb_int`), shown as row shares for the countries in `top`.
tb_int <- table(dat_int$country.class, dat_int$country.name.corect)
prop.table(tb_int[top, ], margin = 1)
```

```{r}
# Unweighted mean, over the countries in `top`, of the row share held by the
# second column of `tb_int`.
mean(prop.table(tb_int[top, ], margin = 1)[, 2])
```

